import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
# NOTE(review): the original line here was the bare notebook shell command
# `ls official_train_test_split`, which is a syntax error outside IPython.
# It only listed the directory and had no effect on the pipeline, so it has
# been replaced by this comment.

# Load the pre-made 80/20 train/test split.
df = pd.read_csv("official_train_test_split/80-20/train_set.csv")
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")
df_test.tail(2)  # quick sanity peek; displays nothing outside a notebook

# Parse record_date into real datetimes for both splits.
df.record_date = pd.to_datetime(df.record_date, format='%Y-%m-%d')
df_test.record_date = pd.to_datetime(df_test.record_date, format='%Y-%m-%d')

# One-hot encode the categorical weather column, then drop the original.
icon_onehot_col = pd.get_dummies(df.weather_situation)
df = pd.concat([df, icon_onehot_col], axis=1).drop(["weather_situation"], axis=1)
icon_onehot_col = pd.get_dummies(df_test.weather_situation)
df_test = pd.concat([df_test, icon_onehot_col], axis=1).drop(["weather_situation"], axis=1)

# Boruta feature-selection results.  The trailing spaces in the sheet name
# "Boruta result 1 " and column name "Decision " match the spreadsheet
# exactly — do not "fix" them.
boruta_1 = pd.read_excel("dataset/Boruta result.xlsx", sheet_name="Boruta result 1 ")
boruta_2 = pd.read_excel("dataset/Boruta result.xlsx", sheet_name="Boruta result 2")
vars_1 = boruta_1[boruta_1["Decision "] == "Confirmed"]["Variable"].values
vars_2 = boruta_2[boruta_2["Decision "] == "Confirmed"]["Variable"].values
# NOTE(review): vars_1 / vars_2 are computed but never used below — the
# hand-curated lists var1_new and var2_ are what is actually selected.
df.head(2)  # displays nothing outside a notebook
# Feature set 1 (hand-curated from the Boruta run) plus the target column.
var1_new = ['Days since first case', 'clear-night',
            'partly-cloudy-night', 'temperature', 'humidity',
            'windSpeed', 'Number of Tweet', 'Sentiments', 'c1_0_action',
            'c1_1_action', 'c1_2_action', 'c1_3_action', 'c2_0_action',
            'c2_1_action', 'c2_2_action', 'c2_3_action', 'c3_0_action',
            'c3_1_action', 'c4_0_action', 'c4_1_action', 'c4_2_action',
            'c4_3_action', 'c4_4_action', 'c5_0_action', 'c5_1_action',
            'c5_2_action', 'c6_0_action', 'c6_1_action', 'c6_2_action',
            'c6_3_action', 'c7_0_action', 'c7_1_action', 'c7_2_action',
            'c8_0_action', 'c8_1_action', 'c8_2_action', 'c8_3_action',
            'c8_4_action', 'e1_0_action', 'e1_1_action', 'e1_2_action',
            'e2_0_action', 'e2_1_action', 'e2_2_action', 'h1_0_action',
            'h1_1_action', 'h1_2_action', 'h2_0_action', 'h2_1_action',
            'h2_2_action', 'h2_3_action', 'h3_0_action', 'h3_1_action',
            'h3_2_action', 'population_density',
            'urbanPopulation', 'healthExpenditure', 'gdp_per_capita',
            'cvd_death_rate', 'diabetes_prevalence', 'handwashing_facilities',
            'hospital_beds_per_thousand', 'female_smokers', 'male_smokers',
            'life_expectancy', 'Outputvalue_lag1', 'Outputvalue_lag2',
            'Outputvalue_lag3', 'Outputvalue_lag4', 'Outputvalue_lag5',
            'Outputvalue_lag6', 'Outputvalue_lag7',
            'Month', 'Day_of_Month', 'Day_of_Week', 'Log_new_cases_per_million']
df_train = df[var1_new]
# BUG FIX: the original did `df_test = df_test[var1_new]`, destructively
# reducing the full test frame to the feature-set-1 columns.  The later
# feature-set-2 selection (`df_test[var2_]`) then raised KeyError because
# columns such as 'partly-cloudy-day' and 'c3_2_action' were gone.
# Select into a new name and leave df_test intact.
df_test_v1 = df_test[var1_new]
print(df_train.shape, df_test_v1.shape)
df_train.head(2)  # displays nothing outside a notebook
# Split features / target; the target is log-transformed new cases per million.
y_train = df_train['Log_new_cases_per_million']
X_train = df_train.drop('Log_new_cases_per_million', axis=1)
y_test = df_test_v1['Log_new_cases_per_million']
X_test = df_test_v1.drop('Log_new_cases_per_million', axis=1)
def get_stacking():
    """Build the feature-set-1 stacking ensemble.

    Four base regressors feed a linear meta-learner; out-of-fold base
    predictions for the meta-learner come from 5-fold CV.
    """
    base_estimators = [
        # ('cart', DecisionTreeRegressor()),  # excluded, as in the original run
        ('ranFor', RandomForestRegressor(random_state=0)),
        ('graBoost', GradientBoostingRegressor(random_state=0)),
        ('ridge', Ridge(alpha=10, tol=0.001)),
        ('lasso', Lasso(alpha=0.001)),
        ('linear_reg', LinearRegression()),
    ]
    meta_learner = LinearRegression()
    return StackingRegressor(estimators=base_estimators,
                             final_estimator=meta_learner, cv=5)
# get a list of models to evaluate
def get_models():
    """Return a {name: estimator} dict of every model compared in the CV study."""
    candidates = {
        # 'cart': DecisionTreeRegressor(),  # excluded, as in the original run
        # NOTE(review): alpha=0.001 here differs from the Ridge(alpha=10)
        # used inside get_stacking() — confirm the difference is intentional.
        'ranFor': RandomForestRegressor(random_state=0),
        'graBoost': GradientBoostingRegressor(random_state=0),
        'ridge': Ridge(alpha=0.001),
        'lasso': Lasso(alpha=0.001),
        'linear_reg': LinearRegression(),
    }
    candidates['stacking'] = get_stacking()
    return candidates
# evaluate a given model using cross-validation
def evaluate_model(model, scoring='r2', n_repeats=5):
    """Score `model` on the module-level X_train / y_train via repeated 10-fold CV.

    Generalized (backward-compatibly) from the original hard-coded version:
    the defaults reproduce scoring='r2' with 5 repeats exactly.

    Parameters
    ----------
    model : estimator to cross-validate
    scoring : sklearn scoring string (default 'r2')
    n_repeats : number of 10-fold repetitions (default 5)

    Returns
    -------
    ndarray of per-fold scores.
    """
    cv = RepeatedKFold(n_splits=10, n_repeats=n_repeats, random_state=1)
    # error_score='raise' surfaces fitting failures instead of silently
    # recording NaN scores.
    scores = cross_val_score(model, X_train, y_train, scoring=scoring,
                             cv=cv, n_jobs=-1, error_score='raise')
    return scores
# get the models to evaluate
models = get_models()
# evaluate the models (R^2, feature set 1) and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    # mean and std of the per-fold R^2 scores
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
# NOTE(review): matplotlib >=3.9 renames boxplot's `labels` kwarg to
# `tick_labels` — update if targeting newer matplotlib.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# FIX: the original line here was the IPython magic `%matplotlib inline`,
# a syntax error outside a notebook; placed after pyplot.show() it had no
# effect even inside one, so it has been removed.
# evaluate a given model using cross-validation (negative-RMSE variant)
def evaluate_model(model, scoring='neg_root_mean_squared_error', n_repeats=3):
    """Redefinition: same repeated 10-fold CV on the module-level
    X_train / y_train, but defaulting to negative RMSE with 3 repeats.

    Generalized backward-compatibly: the defaults reproduce the original
    hard-coded behavior exactly.

    NOTE(review): this run uses n_repeats=3 while the R^2 run above uses 5 —
    confirm the asymmetry is intentional.
    """
    cv = RepeatedKFold(n_splits=10, n_repeats=n_repeats, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring=scoring,
                             cv=cv, n_jobs=-1, error_score='raise')
    return scores
# get the models to evaluate
models = get_models()
# evaluate the models (negative RMSE, feature set 1) and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    # mean and std of the per-fold negative-RMSE scores
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
# NOTE(review): matplotlib >=3.9 renames boxplot's `labels` kwarg to
# `tick_labels` — update if targeting newer matplotlib.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# FIX: removed the trailing `%matplotlib inline` IPython magic — a syntax
# error outside a notebook and a no-op after pyplot.show().
# Fit a reference random forest on feature set 1 and chart the ten most
# influential features by impurity-based importance.
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)
importance = pd.Series(reg.feature_importances_, index=X_train.columns)
importance.nlargest(10).plot(kind='barh')
Residuals, in the context of regression models, are the difference between the observed value of the target variable (y) and the predicted value (ŷ), i.e. the error of the prediction. The residuals plot shows the difference between residuals on the vertical axis and the dependent variable on the horizontal axis, allowing you to detect regions within the target that may be susceptible to more or less error.
# Residual diagnostics (feature set 1): yellowbrick fits the estimator on
# the training data and plots residuals against predicted values, making
# regions of higher/lower error visible.
model = RandomForestRegressor(random_state=0)
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.show()
# Partial-dependence plots for the first ~19 feature-set-1 columns.
# FIX: sklearn.inspection.plot_partial_dependence was deprecated in 0.24
# and removed in 1.2; PartialDependenceDisplay.from_estimator is the
# drop-in replacement.
from sklearn.inspection import PartialDependenceDisplay

reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# NOTE(review): feature index 3 appears in both of the first two groups,
# mirroring the original code — confirm the overlap is intentional.
for feature_group in ([1, 2, 3], [3, 4, 5], [6, 7, 8], [9, 10, 11],
                      [12, 13, 14], [15, 16, 17, 18]):
    disp1 = PartialDependenceDisplay.from_estimator(reg, X_train,
                                                    features=feature_group)
# Held-out evaluation of the feature-set-1 random forest.
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
model.score(X_test, y_test)  # R^2 on the test split (value shown only in a notebook)
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
# FIX: dropped `squared=True` — it is the default and the parameter is
# deprecated (sklearn 1.4+); the value is the plain MSE either way.
mean_squared_error(y_test, y_pred)
# Feature set 2 (second Boruta-informed selection) plus the target column.
var2_ = ['Days since first case', 'clear-night',
         'partly-cloudy-day', 'partly-cloudy-night', 'temperature', 'humidity',
         'windSpeed', 'Number of Tweet', 'Sentiments', 'c1_0_action',
         'c1_1_action', 'c1_2_action', 'c1_3_action', 'c2_0_action',
         'c2_1_action', 'c2_2_action', 'c2_3_action', 'c3_2_action',
         'c4_0_action', 'c4_1_action', 'c4_2_action', 'c4_3_action',
         'c4_4_action', 'c5_0_action', 'c5_1_action', 'c5_2_action',
         'c6_0_action', 'c6_1_action', 'c6_2_action', 'c6_3_action',
         'c7_0_action', 'c7_1_action', 'c7_2_action', 'c8_0_action',
         'c8_1_action', 'c8_2_action', 'c8_3_action', 'c8_4_action',
         'e1_0_action', 'e1_1_action', 'e1_2_action', 'e2_0_action',
         'e2_1_action', 'e2_2_action', 'h1_0_action', 'h1_1_action',
         'h1_2_action', 'h2_0_action', 'h2_1_action', 'h2_2_action',
         'h2_3_action', 'h3_0_action', 'h3_1_action', 'h3_2_action',
         'H5_Investment in vaccines', 'population_density',
         'aged_65_older_sum', 'gdp_per_capita', 'cvd_death_rate',
         'handwashing_facilities', 'hospital_beds_per_thousand',
         'life_expectancy', 'Log_new_cases_per_million']
df_train = df[var2_]
# BUG FIX: earlier the script destructively reduced df_test to the
# feature-set-1 columns, so selecting var2_ here (which needs columns such
# as 'partly-cloudy-day', 'c3_2_action' and 'H5_Investment in vaccines')
# raised KeyError.  Rebuild the full test frame from disk with the same
# preprocessing as before.  (Harmless if df_test was left intact.)
df_test = pd.read_csv("official_train_test_split/80-20/test_set.csv")
df_test.record_date = pd.to_datetime(df_test.record_date, format='%Y-%m-%d')
df_test = pd.concat([df_test, pd.get_dummies(df_test.weather_situation)],
                    axis=1).drop(["weather_situation"], axis=1)
df_test = df_test[var2_]
print(df_train.shape, df_test.shape)
# Split features / target for the feature-set-2 study.
y_train = df_train['Log_new_cases_per_million']
X_train = df_train.drop('Log_new_cases_per_million', axis=1)
y_test = df_test['Log_new_cases_per_million']
X_test = df_test.drop('Log_new_cases_per_million', axis=1)
def get_stacking():
    """Stacking ensemble for the feature-set-2 study.

    Identical to the earlier ensemble except that XGBoost joins the base
    learners; a linear meta-learner combines 5-fold out-of-fold predictions.
    """
    base_estimators = [
        ('xgboost', xgboost.XGBRegressor()),
        ('ranFor', RandomForestRegressor(random_state=0)),
        ('graBoost', GradientBoostingRegressor(random_state=0)),
        ('ridge', Ridge(alpha=10, tol=0.001)),
        ('lasso', Lasso(alpha=0.001)),
        ('linear_reg', LinearRegression()),
    ]
    return StackingRegressor(estimators=base_estimators,
                             final_estimator=LinearRegression(), cv=5)
# get a list of models to evaluate (feature-set-2 study, XGBoost included)
def get_models():
    """Return a {name: estimator} dict of the candidates benchmarked below."""
    candidates = {
        'xgboost': xgboost.XGBRegressor(),
        'ranFor': RandomForestRegressor(random_state=0),
        'graBoost': GradientBoostingRegressor(random_state=0),
        'ridge': Ridge(alpha=0.001),
        'lasso': Lasso(alpha=0.001),
        'linear_reg': LinearRegression(),
    }
    candidates['stacking'] = get_stacking()
    return candidates
# evaluate a given model using cross-validation (R^2, feature set 2)
def evaluate_model(model):
    """Return the repeated 10-fold (5 repeats) CV R^2 scores of `model`
    on the module-level X_train / y_train; failures raise immediately."""
    splitter = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    return cross_val_score(model, X_train, y_train, scoring='r2',
                           cv=splitter, n_jobs=-1, error_score='raise')
# get the models to evaluate
models = get_models()
# evaluate the models (R^2, feature set 2) and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    # mean and std of the per-fold R^2 scores
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
# NOTE(review): matplotlib >=3.9 renames boxplot's `labels` kwarg to
# `tick_labels` — update if targeting newer matplotlib.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# FIX: removed the trailing `%matplotlib inline` IPython magic — a syntax
# error outside a notebook and a no-op after pyplot.show().
# evaluate a given model using cross-validation (negative RMSE, feature set 2)
def evaluate_model(model):
    """Return repeated 10-fold (5 repeats) CV negative-RMSE scores of
    `model` on the module-level X_train / y_train."""
    splitter = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    return cross_val_score(model, X_train, y_train,
                           scoring='neg_root_mean_squared_error',
                           cv=splitter, n_jobs=-1, error_score='raise')
# get the models to evaluate
models = get_models()
# evaluate the models (negative RMSE, feature set 2) and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    # mean and std of the per-fold negative-RMSE scores
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
# NOTE(review): matplotlib >=3.9 renames boxplot's `labels` kwarg to
# `tick_labels` — update if targeting newer matplotlib.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# FIX: removed the trailing `%matplotlib inline` IPython magic — a syntax
# error outside a notebook and a no-op after pyplot.show().
# Refit the reference random forest on feature set 2 and chart the ten
# most influential features by impurity-based importance.
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)
importance = pd.Series(reg.feature_importances_, index=X_train.columns)
importance.nlargest(10).plot(kind='barh')
# Residual diagnostics (feature set 2): yellowbrick fits the estimator on
# the training data and plots residuals against predicted values.
model = RandomForestRegressor(random_state=0)
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.show()
# Held-out metrics for the feature-set-2 random forest.
from sklearn.metrics import r2_score
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)  # R^2 on the test split
# model.score(X_test, y_test) # =R2
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
# FIX: dropped `squared=True` — it is the default and the parameter is
# deprecated (sklearn 1.4+); the value is the plain MSE either way.
mean_squared_error(y_test, y_pred)
# Partial-dependence plots across (almost) all feature-set-2 columns.
# FIX: sklearn.inspection.plot_partial_dependence was deprecated in 0.24
# and removed in 1.2; PartialDependenceDisplay.from_estimator replaces it.
from sklearn.inspection import PartialDependenceDisplay

# NOTE(review): the original mixed `model` (first call) and `reg` (all
# later calls); both are RandomForestRegressor(random_state=0) fit on the
# same X_train/y_train, so `model` is used consistently here.
# NOTE(review): feature index 3 repeats across the first two groups and
# index 34 is skipped ([33, 35, 36]), mirroring the original exactly —
# confirm both are intended.
feature_groups = ([1, 2, 3], [3, 4, 5], [6, 7, 8], [9, 10, 11],
                  [12, 13, 14], [15, 16, 17], [18, 19, 20], [21, 22, 23],
                  [24, 25, 26], [27, 28, 29], [30, 31, 32], [33, 35, 36],
                  [37, 38, 39], [40, 41, 42], [43, 44, 45], [46, 47, 48],
                  [49, 50, 51], [52, 53, 54], [55, 56, 57], [58, 59, 60])
for group in feature_groups:
    disp1 = PartialDependenceDisplay.from_estimator(model, X_train,
                                                    features=group)
import shap
# load JS visualization code to notebook
shap.initjs()
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
# TreeExplainer works on tree ensembles such as the RandomForestRegressor
# bound to `model` above.
explainer = shap.TreeExplainer(model)
# shap_values: one row per training sample, one column per feature,
# computed on the training matrix X_train.
shap_values = explainer.shap_values(X_train)
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])
The above explanation shows features each contributing to push the model output from the base value to the model output for this sample. Features pushing the prediction higher are shown in red; those pushing the prediction lower are in blue. The base value is the average model output over the training data; the concrete base and predicted values, and which features dominate, should be read off the rendered plot for this dataset. (The original text here quoted base value 0.4979, "Goal Scored = 2" and "ball possession" from the SHAP documentation's football example, which does not apply to this data.)
# visualize the training set predictions
# (stacks one force plot per training sample into a single interactive JS
# view; this can be slow and memory-heavy for large training sets)
shap.force_plot(explainer.expected_value, shap_values, X_train)